*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*
* Replication Files to: P-hacking in clinical trials and how incentives shape the distribution of results across phases 
* Date: April 2020
* Authors: Jérôme Adda, Christian Decker, and Marco Ottaviani
*
*	- Input: Interim Data generated in 2
* 			 		 
*	- Output: Interim Data & 'linked_phases.dta'
*			 
* Topic: Linking Phase II & Phase III Trials
*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*

*------------------------*
* Section 0 -- Directory *
*------------------------*

* Start from a clean session: close any log left open by an earlier run
* (ignoring the error if none is open), turn off output paging, and clear
* everything held in memory, including matrices.
capture log close
set more off
clear all
clear matrix

*-----*
* 0.1 *
*-----*

* Run the do-file that defines all the globals for the folder and sub-folder
* paths to both data and analysis.


* Load the path globals used throughout this script. The hard-coded location
* below is only used when the script runs under the user name chrdec; under any
* other user name the globals must already have been defined (for example by a
* master do-file) before this script is started.
if c(username) == "chrdec" {													
	do "C:/Users/chrdec/Dropbox/clinical trials/stata/PNAS revision/2_analysis/dofiles/0_globals.do"   			// 0_globals.do path
}

* Stop early with a clear message when the data path global is empty. Without
* this check the script would continue, change to an unintended directory and
* fail later with a misleading file-not-found error.
if `"${data}"' == "" {
	display as error "global macro data is not defined: run 0_globals.do (or define the path globals) before this do-file"
	exit 198
}


*-----*
* 0.2 * 
*-----*

* Set up the directory where data are stored
* Print all currently defined macros so the path globals can be checked by eye.
macro list

* Work from the data directory and raise the matrix size limit.
cd "${data}"
set matsize 11000

* Suspend logging if a log happens to be open (any error is ignored), then
* create this script's log file, overwriting an existing one.
capture log off
log using "${run}${log}log_3.txt", replace

*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~

*--------------------*
* Table of contents: *
*--------------------*
* Sections: 
* 1. Prepare Data
* 2. Matching 


*---------------------------*
* Section 1 -- Prepare Data *
*---------------------------*

*-----*
* 1.1 *
*-----*

* Prepare Conditions Data

* Build one row per trial with its MeSH condition terms spread over the
* columns condition1, condition2, ... (at most ten per trial).

* Only the trial id and the MeSH term are needed from the conditions table.
use nct_id mesh_term using "${data}${inter}browse_conditions.dta", clear
rename mesh_term condition

* Discard the generic term "Disease"; the comparison is made before
* lower-casing, so only this exact capitalisation is removed.
keep if condition != "Disease"

* Normalise the text: lower case, collapse repeated internal blanks, and strip
* leading and trailing blanks.
replace condition = strtrim(stritrim(lower(condition)))

* Number the terms alphabetically within each trial and keep the first ten.
bysort nct_id (condition): generate n = _n
tabulate n
keep if n <= 10

reshape wide condition, i(nct_id) j(n)
save "${data}${inter}conditions_mesh_wide.dta", replace

*-----*
* 1.2 *
*-----*

* Merge Tables and Generate Variables

* Assemble the Phase 3 drug-intervention data: one row per intervention and
* alternative name, carrying the trial-level start date and status flags.

* Restrict the studies table to interventional Phase 3 trials.
use nct_id start_date study_type phase overall_status completion_date ///
	using "${data}${inter}studies.dta", clear
keep if study_type == "Interventional" & phase == "Phase 3"

* Extract year (characters 1-4) and month (characters 6-7) from the start and
* completion date strings and convert them to numbers.
foreach stem in start completion {
	generate `stem'_year = substr(`stem'_date, 1, 4)
	generate `stem'_month = substr(`stem'_date, 6, 2)
	destring `stem'_year `stem'_month, replace
}

* applicable = 1 for trials that are completed, terminated or of unknown status
* and whose completion year is 2018 or earlier; 0 otherwise (a missing
* completion year compares as larger than 2018 and therefore gives 0).
generate applicable = inlist(overall_status, "Completed", "Unknown status", "Terminated") & completion_year <= 2018


* Attach p_provided from the interim analysis data, dropping rows that exist
* only in that file; trials without a match get p_provided = 0.
merge 1:m nct_id using "${data}${inter}data_counterfactual_analysis_interim.dta", keepusing(p_provided) keep(master match) nogenerate
replace p_provided = 0 if p_provided == .

* Back to one row per trial, taking the maximum of each variable.
collapse (max) start_year start_month p_provided applicable, by(nct_id)


* Attach the interventions of each trial, keeping matched rows only, and then
* restrict to interventions of type "Drug".
merge 1:m nct_id using "${data}${inter}interventions.dta", keep(match) nogenerate
rename (id name) (intervention_id intervention_name)
replace intervention_name = lower(intervention_name)
keep if intervention_type == "Drug"

* Attach the alternative names of each intervention, dropping names whose
* intervention is not in the data. _merge from this merge is left in place
* because it is dropped by the string-cleaning step that follows.
merge 1:m intervention_id using "${data}${inter}intervention_other_names.dta", keep(master match)


*-----*
* 1.3 *
*-----*

* Clean Intervention Strings

* Build, for every trial, one cleaned search string aux that contains the
* names and alternative names of all its drug interventions.
*
* Changes relative to the earlier version of this step:
*   - the concatenations now include every a# / aux# column; previously a13,
*     a16, aux6, aux9 and aux14 were left out, so those names never reached
*     the search string
*   - character replacements apply to all occurrences instead of only the
*     first ten per string
*   - the column lists are built from the variables that actually exist, so
*     the step also works when fewer than 50 / 30 columns are created

* Number the alternative names within each intervention and keep at most 50.
bysort intervention_id (id): gen n2 = _n
tab n2, m
drop if n2 > 50
rename other_name a
replace a = lower(a)
drop _merge id
reshape wide a, i(intervention_id) j(n2)

* Collect the alternative-name columns that exist (a1 ... a50).
local othernames
forvalues k = 1/50 {
	capture confirm variable a`k', exact
	if !_rc {
		local othernames `othernames' a`k'
	}
}

* One blank-separated string per intervention: main name plus alternatives.
egen aux = concat(intervention_name `othernames'), punct(" ")

* Harmonise spelling and blank out symbols; a missing count in subinstr()
* means that every occurrence is replaced. The last three characters are
* presumably leftovers of mis-encoded text -- TODO confirm.
replace aux = subinstr(aux, "alfa", "alpha", .)
replace aux = subinstr(aux, "®", " ", .)
replace aux = subinstr(aux, `"""', " ", .)
replace aux = subinstr(aux, "?", " ", .)
replace aux = subinstr(aux, "&", " ", .)
replace aux = subinstr(aux, "â", " ", .)
replace aux = subinstr(aux, "Â", " ", .)
replace aux = subinstr(aux, "Ã", " ", .)

* Collapse repeated blanks and strip leading and trailing blanks.
replace aux = strtrim(stritrim(aux))

* Number the interventions within each trial, keep at most 30, and spread
* their strings over the columns aux1, aux2, ...
keep nct_id aux start_year start_month intervention_id p_provided applicable
bysort nct_id (intervention_id): gen n = _n
tab n, m
drop if n > 30
drop intervention_id
reshape wide aux, i(nct_id) j(n)

* Collect the per-intervention columns that exist (aux1 ... aux30).
local auxparts
forvalues k = 1/30 {
	capture confirm variable aux`k', exact
	if !_rc {
		local auxparts `auxparts' aux`k'
	}
}

* One blank-separated search string per trial.
egen aux = concat(`auxparts'), punct(" ")
keep nct_id aux start_year start_month p_provided applicable
replace aux = strtrim(stritrim(aux))

*-----*
* 1.4 *
*-----*

* Append Hand-Collected Phase 2 Intervention Data and Initialize Match Variables

* Stack the hand-collected Phase 2 intervention rows below the Phase 3 rows.
* From here on, rows with a non-missing id are the appended Phase 2 rows and
* rows with a missing id are the Phase 3 rows built above.
append using "${data}${raw}phase2_interventions_hand_collected.dta"

* Counters that start at zero on the Phase 2 rows and stay missing elsewhere.
foreach counter in match n_match n_appl {
	generate `counter' = 0 if id != .
}
generate n_p = .

* Empty string on the Phase 3 rows; the matching loop appends trial ids to it.
generate match_nct = "" if id == .


*-----*
* 1.5 *
*-----*

* Merge Conditions and Save

* Attach the wide condition columns to every row by trial id, dropping
* condition records whose trial is not in the data.
merge m:1 nct_id using "${data}${inter}conditions_mesh_wide.dta", keep(master match) nogenerate

* On the rows with a missing id, join the ten condition columns into one
* string separated by "|"; this is the string searched by the matching loop.
egen aux_condition = concat(condition1 condition2 condition3 condition4 condition5 condition6 condition7 condition8 condition9 condition10) if id == ., punct("|")

save "${data}${inter}matching_input.dta", replace

*-----------------------*
* Section 2 -- Matching *
*-----------------------*

* For each of the 1941 hand-collected rows (id = 1 ... 1941), flag the rows
* with a missing id that
*   (1) contain all five drug terms in their intervention string aux,
*   (2) contain all ten condition terms in aux_condition, where the first
*       condition term must be non-empty, and
*   (3) start in the same month or later, or have no start year.
* Terms are passed to regexm() and are therefore read as regular expressions;
* the tests on empty terms rely on an empty pattern matching every string.
* The number of hits is stored in n_match and match is set to 1 on the
* hand-collected row; the nct_id of that row is appended to match_nct on
* every row that was hit.

set more off 
use "${data}${inter}matching_input", clear

sort id

* The loop reads the search terms from observation number idn but writes the
* results where id equals idn. Both refer to the same row only if, after
* sorting, observation k holds id k. Stop if that does not hold rather than
* silently linking the wrong trials.
capture assert id == _n in 1/1941
if _rc {
	display as error "matching loop requires id to equal the observation number for ids 1-1941 after sorting on id"
	exit 9
}

forval idn = 1 / 1941 {
	disp in red "id  `idn'"

	* Search terms and start date of the current hand-collected row.
	forvalues k = 1/5 {
		local word`k' = drug`k'[`idn']
	}
	forvalues k = 1/10 {
		local c`k' = condition`k'[`idn']
	}
	local start_year=start_year[`idn']
	local start_month=start_month[`idn']
	local mnct=nct_id[`idn']

	* Indicator per criterion, initialised to 0 on the rows with missing id.
	qui gen f_int=0 if id==.
	qui gen f_cond=0 if id==.
	qui gen f_date=0 if id==.
	qui replace f_int=1 if regexm(aux,"`word1'")==1 & regexm(aux,"`word2'")==1 & regexm(aux,"`word3'")==1 & regexm(aux,"`word4'")==1 & regexm(aux,"`word5'")==1
	qui replace f_cond=1 if regexm(aux_condition,"`c1'")*strlen("`c1'")>0 & regexm(aux_condition,"`c2'")==1 & regexm(aux_condition,"`c3'")==1 & regexm(aux_condition,"`c4'")==1 & regexm(aux_condition,"`c5'")==1 & regexm(aux_condition,"`c6'")==1 & regexm(aux_condition,"`c7'")==1 & regexm(aux_condition,"`c8'")==1 & regexm(aux_condition,"`c9'")==1 & regexm(aux_condition,"`c10'")==1
	qui replace f_date=1 if (start_year>`start_year') | (start_year==`start_year' & start_month>=`start_month') | start_year==.

	* A row is a hit only when all three criteria hold.
	qui gen f=f_int*f_cond*f_date

	* f only takes the values 0, 1 or missing, so the number of rows with
	* f equal to 1 is both the sum of f and the test for any hit. Counting
	* once avoids creating two full-length helper variables per iteration.
	qui count if f==1
	local n_hits = r(N)
	qui replace n_match=`n_hits' if id==`idn'
	qui replace match=1 if id==`idn' & `n_hits'>0

	* Record the current trial id on every row that was hit.
	qui replace match_nct=match_nct+" `mnct'" if id==. & f==1
	drop f_int f_cond f_date f
}

* Mark the rows that carry a non-missing id (the appended Phase 2 rows).
generate Ph2 = 1 if id != .

* Reduce to one row per nct_id: total of n_match, maximum of Ph2 and of match
* (stored as matched), and the first match_nct value.
collapse (sum) n_match (max) Ph2 matched=match (first) match_nct, by(nct_id)

label variable nct_id "phase 2 nct_id"
label variable Ph2 "phase 2"
label variable n_match "number linked phase 3 trials"
label variable matched "matching phase 3 trial found"
label variable match_nct "nct_ids of phase 2 trials linked to this phase 3 trial"

* The sum over rows without a Phase 2 flag is 0; set it back to missing so
* that n_match is only defined where Ph2 is set.
replace n_match = . if Ph2 == .

save "${data}${final}linked_phases.dta", replace


* Close the log and return to the do-file directory.
log close 
cd "${run}${dofiles}"
